HW 01

Author

Matt Osterhoudt

0 - Setup

# Install pacman only when it is missing. requireNamespace() checks for the
# package without attaching it, which is sufficient because p_load() is
# called through the pacman:: prefix below.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# use this line for installing/loading
# cowplot (not "cowgrid") is the package that provides plot_grid(), which
# section 2 uses to stack the boxplot and histogram.
pacman::p_load(
  tidyverse,
  palmerpenguins,
  here,
  cowplot
)

# Download openintro only when it is not installed yet, so re-running the
# document does not reinstall it every time.
if (!requireNamespace("openintro", quietly = TRUE)) {
  install.packages("openintro")
}
package 'openintro' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\matto\AppData\Local\Temp\RtmpcV01DQ\downloaded_packages
# Attach openintro (presumably the source of the `county` data used in
# section 3 -- confirm).
library(openintro)

# Install dsbox from GitHub only when it is missing; an unconditional
# install_github() would hit the network on every render.
if (!requireNamespace("dsbox", quietly = TRUE)) {
  devtools::install_github("tidyverse/dsbox")
}

# Default ggplot2 theme for plots that do not set their own theme.
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# Default figure options applied to every knitr chunk.
knitr::opts_chunk$set(
  fig.width = 7,
  fig.asp = .618,
  fig.retina = 3,
  fig.align = "center",
  dpi = 300
)

1 - Road traffic accidents in Edinburgh

# Read the accident records from data/accidents.csv.
accidents_data <- here("data", "accidents.csv") |>
  read_csv()
## glimpse(accidents_data) # Quick glimpse of the data

# Add a column that labels each record "Weekend" (Saturday or Sunday) or
# "Weekday" (any other value of day_of_week).
accidents_data <- mutate(
  accidents_data,
  weekday_or_weekend = ifelse(
    day_of_week %in% c("Saturday", "Sunday"),
    "Weekend",
    "Weekday"
  )
)

# Density plot: accident time of day, filled by severity, with one panel
# for weekdays and one for weekends stacked in two rows.
accidents_data |>
  ggplot(aes(x = time, fill = severity)) +
  geom_density(alpha = 0.5, adjust = 1) +
  facet_wrap(vars(weekday_or_weekend), nrow = 2) +
  scale_fill_manual(
    values = c(
      "Fatal" = "#9370DB",
      "Serious" = "#87CEEB",
      "Slight" = "#FFFFE0"
    )
  ) +
  labs(
    title = "Number of Accidents throughout the day\nBy day of week and severity",
    x = "Time of Day",
    y = "Density"
  ) +
  theme_minimal()

2 - NYC marathon winners

library("cowplot") # Provides plot_grid(), used to stack the two plots.

nyc_marathon_data <- here("data", "nyc_marathon.csv") |>
  read_csv()

# Quick glimpse of the data
# glimpse(nyc_marathon_data)

# Histogram of completion times, using bins of width 75.
histogram <- nyc_marathon_data |>
  ggplot(aes(x = time)) +
  geom_histogram(binwidth = 75) +
  labs(x = "Marathon Completion Time")

# Boxplot of the same variable. Axis lines, ticks, titles and the y-axis
# text are blanked so it looks cleaner when stacked on the histogram.
boxplot <- nyc_marathon_data |>
  ggplot(aes(x = time)) +
  geom_boxplot() +
  labs(title = "A. Boxplot and Histogram") +
  theme_minimal(base_size = 14) +
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank()
  )

# Stack the boxplot on top of the histogram in a single column, aligned
# vertically on their left axes. Relative row heights are set with
# `rel_heights`; plot_grid() has no `heights` argument, so a `heights =`
# value is swallowed by `...` and treated as an extra plot instead of
# sizing the rows.
plot_grid(
  boxplot,
  histogram,
  align = "v",
  axis = "l",
  ncol = 1,
  rel_heights = c(1, 3)
)

#Answer to Part A: The histogram gives a better visual of the distribution of runners that completed the marathon. We can see a clearer visual representation of the completion times. The boxplot, however, gives us a clearer indication of the median, outliers, and quartiles. The boxplot lacks the visualization of the distribution, while the histogram lacks the statistical information such as median, outliers, and quartiles.


# Part B Code
# Side-by-side boxplots of completion time (hours) for each division,
# filled blue for men and red for women.
nyc_marathon_data |>
  ggplot(aes(x = time_hrs, y = division, fill = division)) +
  geom_boxplot() +
  scale_fill_manual(values = c("Men" = "Blue", "Women" = "Red")) +
  ggtitle("Men vs Women Marathon Times") +
  xlab("Marathon Time by Hour") +
  ylab("Division by Gender") +
  theme_minimal(base_size = 14)

#Answer to Part B: Compared to women, men have a narrower overall range of marathon times. Men also have a smaller interquartile range and, on average, a faster completion time. Women have more outlier points.

# Part C Code
# Same comparison as Part B with less ink: no fill colours, no legend and
# no y-axis title. The `fill` mapping is dropped because the constant
# `fill = "gray"` overrides it anyway.
ggplot(nyc_marathon_data, aes(x = time_hrs, y = division)) +
  geom_boxplot(
    show.legend = FALSE,
    fill = "gray"
  ) +
  theme_minimal(base_size = 14) +
  # theme() tweaks must come after theme_minimal(): a complete theme
  # replaces every element set before it, which would bring the y-axis
  # title back.
  theme(axis.title.y = element_blank()) +
  labs(
    title = "Men vs Women Marathon Times",
    x = "Marathon Time by Hour"
  )

#Answer to Part C: The separation by gender already speaks for itself. I removed the legend and the y-axis label. I also decided to remove the color and just use a default gray fill, although I suppose I could have kept the blue vs red/pink, which are the historical default "gender" colors. My changes improve the data-to-ink ratio by cutting out excess colors and redundant information in order to simplify the visual.

# Part D Code
# Completion time (hours) by year, with one coloured, shaped series per
# division and a line joining each division's points.
nyc_marathon_data |>
  ggplot(aes(x = year, y = time_hrs, color = division, shape = division)) +
  geom_point() +
  geom_line(aes(group = division)) + ## necessary to connect the points
  scale_color_manual(values = c("Men" = "Blue", "Women" = "Red")) +
  theme_minimal(base_size = 14) +
  labs(
    title = "Marathon Times by Year",
    x = "Year",
    y = "Marathon Time",
    color = "Division",
    shape = "Division"
  )

#Answer to Part D: What is more visible in this plot compared to the others is the trend. We can visualize the marathon times by year, especially noting the marathon times improving significantly since 1970.


3 - US counties

#Setup
# Drop counties with a missing homeownership or poverty value so the plots
# below do not emit missing-value warnings. The filtered result has to be
# assigned back to `county`; a bare filter() call only prints the rows and
# leaves `county` unchanged.
county <- county |>
  filter(!is.na(homeownership), !is.na(poverty))
# A tibble: 3,140 × 15
   name           state pop2000 pop2010 pop2017 pop_change poverty homeownership
   <chr>          <fct>   <dbl>   <dbl>   <int>      <dbl>   <dbl>         <dbl>
 1 Autauga County Alab…   43671   54571   55504       1.48    13.7          77.5
 2 Baldwin County Alab…  140415  182265  212628       9.19    11.8          76.7
 3 Barbour County Alab…   29038   27457   25270      -6.22    27.2          68  
 4 Bibb County    Alab…   20826   22915   22668       0.73    15.2          82.9
 5 Blount County  Alab…   51024   57322   58013       0.68    15.6          82  
 6 Bullock County Alab…   11714   10914   10309      -2.28    28.5          76.9
 7 Butler County  Alab…   21399   20947   19825      -2.69    24.4          69  
 8 Calhoun County Alab…  112249  118572  114728      -1.51    18.6          70.7
 9 Chambers Coun… Alab…   36583   34215   33713      -1.2     18.8          71.4
10 Cherokee Coun… Alab…   23988   25989   25857      -0.6     16.1          77.5
# ℹ 3,130 more rows
# ℹ 7 more variables: multi_unit <dbl>, unemployment_rate <dbl>, metro <fct>,
#   median_edu <fct>, per_capita_income <dbl>, median_hh_income <int>,
#   smoking_ban <fct>
#Part A: The following code attempts to create a boxplot with categorical data of smoking ban in the x-axis and numerical pop in the y-axis. It also tries to layer this with a scatterplot of numerical median income data, and categorical median education data. This code will output a plot but it does not really work correctly nor does it make sense. First off, geom_point will create a scatterplot, which is expecting two continuous variables. median_edu is not a continuous variable, so this will cause issues. I think the boxplot is fine, but combining it with a scatterplot does not make sense and will lead to a confusing graphic. 
# na.rm = TRUE makes each layer drop missing values silently. Wrapping the
# plot in suppressWarnings() does not do that: the wrapper only covers
# building the plot object, while the warnings are raised later, when the
# returned plot is printed.
ggplot(county) +
  geom_point(aes(x = median_edu, y = median_hh_income), na.rm = TRUE) +
  geom_boxplot(aes(x = smoking_ban, y = pop2017), na.rm = TRUE)

#Part B: The second plot (the one where the median_edu levels are laid out as vertical columns) makes it much easier to compare. The spacing is better, which makes it easier to read. It's also easier to read the charts from top to bottom rather than side to side. This means the placement of the faceting variable is dependent on what we want to compare. In this case, we want to see the poverty levels across people from different median education levels, so placing the median education levels in vertical columns makes the most sense.
# Poverty vs homeownership, faceted by median education level in rows.
# na.rm = TRUE silences the missing-value warning at draw time; a
# suppressWarnings() wrapper around the plot object does not, because the
# warning is raised when the plot is printed, outside the wrapper.
ggplot(county %>% filter(!is.na(median_edu))) +
  geom_point(aes(x = homeownership, y = poverty), na.rm = TRUE) +
  facet_grid(median_edu ~ .)

# Same plot with the education levels laid out as columns instead.
ggplot(county %>% filter(!is.na(median_edu))) +
  geom_point(aes(x = homeownership, y = poverty), na.rm = TRUE) +
  facet_grid(. ~ median_edu)

#Part C

# Plot A: scatterplot of poverty against homeownership.
county |>
  ggplot(aes(x = homeownership, y = poverty)) +
  geom_point(size = 2) +
  ggtitle("Plot A") +
  xlab("homeownership") +
  ylab("poverty") +
  theme_gray()

# Plot B: the same scatterplot with a single blue quadratic fit and no
# confidence band.
county |>
  ggplot(aes(x = homeownership, y = poverty)) +
  geom_point(size = 2) +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "blue",
    se = FALSE
  ) +
  ggtitle("Plot B") +
  xlab("homeownership") +
  ylab("poverty") +
  theme_gray()

# Plot C: points with one green smoother per metro group.
# `line` is not a ggplot2 aesthetic and is silently ignored, which leaves a
# single pooled smoother; `group` is the aesthetic that splits the curve.
# NOTE(review): assumes the target figure shows one curve per metro value.
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point() +
  geom_smooth(aes(group = metro), se = FALSE, color = "green") +
  labs(
    title = "Plot C",
    x = "homeownership",
    y = "poverty"
  )

# Plot D: one blue smoother per metro group, added before the points so
# the points are drawn on top of the curves.
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_smooth(aes(group = metro), se = FALSE, color = "blue") +
  geom_point() +
  labs(
    title = "Plot D",
    x = "homeownership",
    y = "poverty"
  )

# Plot E: points coloured by metro; one smoother per metro value,
# distinguished by line type (solid for "no", dashed for "yes").
# The former `line = metro` mapping is removed: `line` is not a ggplot2
# aesthetic, so it had no effect. The linetype mapping already splits the
# smoother by metro.
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro), size = 2) +
  geom_smooth(aes(linetype = metro), se = FALSE) +
  scale_linetype_manual(values = c("no" = "solid", "yes" = "dashed")) +
  theme_gray() +
  labs(
    title = "Plot E",
    x = "homeownership",
    y = "poverty"
  )

# Plot F: points and smoothers both coloured by metro. The colour mapping
# on geom_smooth() is what produces one curve per metro value.
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro), size = 2) +
  geom_smooth(aes(color = metro), se = FALSE) +
  theme_gray() +
  labs(
    title = "Plot F",
    x = "homeownership",
    y = "poverty"
  )

# Plot G: points coloured by metro with a single blue quadratic fit drawn
# across all counties.
county |>
  ggplot(aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro), size = 2) +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "blue",
    se = FALSE
  ) +
  ggtitle("Plot G") +
  xlab("homeownership") +
  ylab("poverty") +
  theme_gray()

# Plot H: points coloured by metro, with no smoother.
county |>
  ggplot(aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro), size = 2) +
  ggtitle("Plot H") +
  xlab("homeownership") +
  ylab("poverty") +
  theme_gray()

4 - Rental apartments in SF

# Read the credit data from data/credit.csv.
credit_data <- here("data", "credit.csv") |>
  read_csv()
# Glimpse of Data
# glimpse(credit_data)

#Part A: There appears to be a relationship between credit card balance and income. Generally, higher income means credit card balance will also be higher. This relationship varies a little depending on the student/married factors. Let's start with students. Students tend to have a higher credit card balance than non-students, so that shifts the starting point of the slope. Students also appear to have a slightly flatter slope. Non-married has a little steeper slope compared to married, but married income caps out higher.


#Note: I used this website to figure out how to label the axis: https://datavizpyr.com/dollar-format-for-axis-labels-with-ggplot2/
# Balance against income with a linear fit, in a grid of panels: student
# status in rows, marital status in columns.
ggplot(
  credit_data,
  aes(x = income, y = balance, color = student, shape = student)
) +
  geom_point(size = 2, alpha = .5) +
  # The colour mapping is inherited from ggplot(), so it is not repeated
  # here. The explicit formula is the one "lm" uses by default and avoids
  # the informational message about the chosen formula.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  theme_gray() +
  theme(legend.position = "none") +
  # The argument is spelled `labeller` in full; `label =` only worked
  # through partial argument matching.
  facet_grid(
    rows = vars(student),
    cols = vars(married),
    labeller = labeller(
      student = c("No" = "student: No", "Yes" = "student: Yes"),
      married = c("No" = "married: No", "Yes" = "married: Yes")
    )
  ) +
  labs(
    x = "Income",
    y = "Credit Card Balance"
  ) +
  # scale = 1000 multiplies the income values by 1000 before formatting
  # (presumably income is stored in thousands of dollars -- confirm).
  scale_x_continuous(labels = scales::dollar_format(scale = 1000)) +
  scale_y_continuous(labels = scales::dollar_format())

#Part B: I think that these factors are useful indicators. For example, I would expect a student to have a higher credit card balance compared to a non-student. I would also expect married income to cap out much higher compared to non-married incomes.

#Part C

#First, make the new column and calculate the variable
# Credit utilization is the balance divided by the credit limit.
credit_data <- credit_data |>
  mutate(credit_utilization = balance / limit)

# Credit utilization against income with a linear fit, in a grid of
# panels: student status in rows, marital status in columns.
ggplot(
  credit_data,
  aes(x = income, y = credit_utilization, color = student, shape = student)
) +
  geom_point(size = 2, alpha = .5) +
  # The colour mapping is inherited from ggplot(); the explicit formula is
  # the "lm" default and avoids the informational message.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  theme_gray() +
  theme(legend.position = "none") +
  # The argument is spelled `labeller` in full; `label =` only worked
  # through partial argument matching.
  facet_grid(
    rows = vars(student),
    cols = vars(married),
    labeller = labeller(
      student = c("No" = "student: No", "Yes" = "student: Yes"),
      married = c("No" = "married: No", "Yes" = "married: Yes")
    )
  ) +
  labs(
    x = "Income",
    y = "Credit Utilization"
  ) +
  scale_x_continuous(labels = scales::dollar_format(scale = 1000)) +
  scale_y_continuous(labels = scales::percent_format())

#Part D: This new plot implies that credit utilization is affected by marital status and student status, not simply just income. Students tend to use their credit more, even with higher levels of income. Non-students tend to utilize less credit, but slightly increase the use with higher income. Married credit use seems to be a lot more flat across the board compared to non-married credit, which has more variance but is also dependent on student vs non-student status.

5 - Napoleon’s march.